@@ -6,6 +6,11 @@ module Agents |
||
| 6 | 6 |
class WebsiteAgent < Agent |
| 7 | 7 |
cannot_receive_events! |
| 8 | 8 |
|
| 9 |
+ default_schedule "every_12h" |
|
| 10 |
+ |
|
| 11 |
+ UNIQUENESS_LOOK_BACK = 200 |
|
| 12 |
+ UNIQUENESS_FACTOR = 3 |
|
| 13 |
+ |
|
| 9 | 14 |
description <<-MD |
| 10 | 15 |
The WebsiteAgent scrapes a website, XML document, or JSON feed and creates Events based on the results. |
| 11 | 16 |
|
@@ -34,17 +39,15 @@ module Agents |
||
| 34 | 39 |
|
| 35 | 40 |
Can be configured to use HTTP basic auth by including the `basic_auth` parameter with `username:password`. |
| 36 | 41 |
|
| 37 |
- Set `expected_update_period_in_days` to the maximum amount of time that you'd expect to pass between Events being created by this Agent. |
|
| 42 |
+ Set `expected_update_period_in_days` to the maximum amount of time that you'd expect to pass between Events being created by this Agent. This is only used to set the "working" status. |
|
| 43 |
+ |
|
| 44 |
+ Set `uniqueness_look_back` to limit the number of events checked for uniqueness (typically for performance). This defaults to the larger of #{UNIQUENESS_LOOK_BACK} or #{UNIQUENESS_FACTOR}x the number of detected received results.
|
|
| 38 | 45 |
MD |
| 39 | 46 |
|
| 40 | 47 |
event_description do |
| 41 | 48 |
"Events will have the fields you specified. Your options look like:\n\n #{Utils.pretty_print options['extract']}"
|
| 42 | 49 |
end |
| 43 | 50 |
|
| 44 |
- default_schedule "every_12h" |
|
| 45 |
- |
|
| 46 |
- UNIQUENESS_LOOK_BACK = 30 |
|
| 47 |
- |
|
| 48 | 51 |
def working? |
| 49 | 52 |
event_created_within?(options['expected_update_period_in_days']) && !recent_error_logs? |
| 50 | 53 |
end |
@@ -54,7 +57,7 @@ module Agents |
||
| 54 | 57 |
'expected_update_period_in_days' => "2", |
| 55 | 58 |
'url' => "http://xkcd.com", |
| 56 | 59 |
'type' => "html", |
| 57 |
- 'mode' => :on_change, |
|
| 60 |
+ 'mode' => "on_change", |
|
| 58 | 61 |
'extract' => {
|
| 59 | 62 |
'url' => {'css' => "#comic img", 'attr' => "src"},
|
| 60 | 63 |
'title' => {'css' => "#comic img", 'attr' => "title"}
|
@@ -63,31 +66,44 @@ module Agents |
||
| 63 | 66 |
end |
| 64 | 67 |
|
| 65 | 68 |
def validate_options |
| 69 |
+ # Check for required fields |
|
| 66 | 70 |
errors.add(:base, "url and expected_update_period_in_days are required") unless options['expected_update_period_in_days'].present? && options['url'].present? |
| 67 | 71 |
if !options['extract'].present? && extraction_type != "json" |
| 68 | 72 |
errors.add(:base, "extract is required for all types except json") |
| 69 | 73 |
end |
| 74 |
+ |
|
| 75 |
+ # Check for optional fields |
|
| 76 |
+ if options['mode'].present? |
|
| 77 |
+ errors.add(:base, "mode must be set to on_change or all") unless %w[on_change all].include?(options['mode']) |
|
| 78 |
+ end |
|
| 79 |
+ |
|
| 80 |
+ if options['expected_update_period_in_days'].present? |
|
| 81 |
+ errors.add(:base, "Invalid expected_update_period_in_days format") unless is_positive_integer?(options['expected_update_period_in_days']) |
|
| 82 |
+ end |
|
| 83 |
+ |
|
| 84 |
+ if options['uniqueness_look_back'].present? |
|
| 85 |
+ errors.add(:base, "Invalid uniqueness_look_back format") unless is_positive_integer?(options['uniqueness_look_back']) |
|
| 86 |
+ end |
|
| 70 | 87 |
end |
| 71 | 88 |
|
| 72 | 89 |
def check |
| 73 | 90 |
hydra = Typhoeus::Hydra.new |
| 74 | 91 |
log "Fetching #{options['url']}"
|
| 75 |
- request_opts = {:followlocation => true}
|
|
| 76 |
- if options['basic_auth'].present? |
|
| 77 |
- request_opts[:userpwd] = options['basic_auth'] |
|
| 78 |
- end |
|
| 92 |
+ request_opts = { :followlocation => true }
|
|
| 93 |
+ request_opts[:userpwd] = options['basic_auth'] if options['basic_auth'].present? |
|
| 79 | 94 |
request = Typhoeus::Request.new(options['url'], request_opts) |
| 95 |
+ |
|
| 80 | 96 |
request.on_failure do |response| |
| 81 | 97 |
error "Failed: #{response.inspect}"
|
| 82 | 98 |
end |
| 99 |
+ |
|
| 83 | 100 |
request.on_success do |response| |
| 84 | 101 |
doc = parse(response.body) |
| 85 | 102 |
|
| 86 | 103 |
if extract_full_json? |
| 87 |
- result = doc |
|
| 88 |
- if store_payload? result |
|
| 89 |
- log "Storing new result for '#{name}': #{result.inspect}"
|
|
| 90 |
- create_event :payload => result |
|
| 104 |
+ if store_payload!(previous_payloads(1), doc) |
|
| 105 |
+ log "Storing new result for '#{name}': #{doc.inspect}"
|
|
| 106 |
+ create_event :payload => doc |
|
| 91 | 107 |
end |
| 92 | 108 |
else |
| 93 | 109 |
output = {}
|
@@ -116,6 +132,7 @@ module Agents |
||
| 116 | 132 |
return |
| 117 | 133 |
end |
| 118 | 134 |
|
| 135 |
+ old_events = previous_payloads num_unique_lengths.first |
|
| 119 | 136 |
num_unique_lengths.first.times do |index| |
| 120 | 137 |
result = {}
|
| 121 | 138 |
options['extract'].keys.each do |name| |
@@ -125,7 +142,7 @@ module Agents |
||
| 125 | 142 |
end |
| 126 | 143 |
end |
| 127 | 144 |
|
| 128 |
- if store_payload? result |
|
| 145 |
+ if store_payload!(old_events, result) |
|
| 129 | 146 |
log "Storing new parsed result for '#{name}': #{result.inspect}"
|
| 130 | 147 |
create_event :payload => result |
| 131 | 148 |
end |
@@ -138,16 +155,43 @@ module Agents |
||
| 138 | 155 |
|
| 139 | 156 |
private |
| 140 | 157 |
|
| 141 |
- def store_payload? result |
|
| 142 |
- !options['mode'] || options['mode'].to_s == "all" || (options['mode'].to_s == "on_change" && !previous_payloads.include?(result.to_json)) |
|
| 158 |
+ # This method returns true if the result should be stored as a new event. |
|
| 159 |
+ # If mode is set to 'on_change', this method may return false and update an existing |
|
| 160 |
+ # event to expire further in the future. |
|
| 161 |
+ def store_payload!(old_events, result) |
|
| 162 |
+ if !options['mode'].present? |
|
| 163 |
+ return true |
|
| 164 |
+ elsif options['mode'].to_s == "all" |
|
| 165 |
+ return true |
|
| 166 |
+ elsif options['mode'].to_s == "on_change" |
|
| 167 |
+ result_json = result.to_json |
|
| 168 |
+ old_events.each do |old_event| |
|
| 169 |
+ if old_event.payload.to_json == result_json |
|
| 170 |
+ old_event.expires_at = new_event_expiration_date |
|
| 171 |
+ old_event.save! |
|
| 172 |
+ return false |
|
| 173 |
+ end |
|
| 174 |
+ end |
|
| 175 |
+ return true |
|
| 176 |
+ end |
|
| 177 |
+ raise "Illegal options[mode]: " + options['mode'].to_s |
|
| 143 | 178 |
end |
| 144 | 179 |
|
| 145 |
- def previous_payloads |
|
| 146 |
- events.order("id desc").limit(UNIQUENESS_LOOK_BACK).pluck(:payload).map(&:to_json) if options['mode'].to_s == "on_change"
|
|
| 180 |
+ def previous_payloads(num_events) |
|
| 181 |
+ if options['uniqueness_look_back'].present? |
|
| 182 |
+ look_back = options['uniqueness_look_back'].to_i |
|
| 183 |
+ else |
|
| 184 |
+ # Larger of UNIQUENESS_FACTOR * num_events and UNIQUENESS_LOOK_BACK |
|
| 185 |
+ look_back = UNIQUENESS_FACTOR * num_events |
|
| 186 |
+ if look_back < UNIQUENESS_LOOK_BACK |
|
| 187 |
+ look_back = UNIQUENESS_LOOK_BACK |
|
| 188 |
+ end |
|
| 189 |
+ end |
|
| 190 |
+ events.order("id desc").limit(look_back) if options['mode'].present? && options['mode'].to_s == "on_change"
|
|
| 147 | 191 |
end |
| 148 | 192 |
|
| 149 | 193 |
def extract_full_json? |
| 150 |
- (!options['extract'].present? && extraction_type == "json") |
|
| 194 |
+ !options['extract'].present? && extraction_type == "json" |
|
| 151 | 195 |
end |
| 152 | 196 |
|
| 153 | 197 |
def extraction_type |
@@ -174,5 +218,13 @@ module Agents |
||
| 174 | 218 |
raise "Unknown extraction type #{extraction_type}"
|
| 175 | 219 |
end |
| 176 | 220 |
end |
| 221 |
+ |
|
| 222 |
+ def is_positive_integer?(value) |
|
| 223 |
+ begin |
|
| 224 |
+ Integer(value) >= 0 |
|
| 225 |
+ rescue |
|
| 226 |
+ false |
|
| 227 |
+ end |
|
| 228 |
+ end |
|
| 177 | 229 |
end |
| 178 | 230 |
end |
@@ -15,15 +15,31 @@ describe Agents::WebsiteAgent do |
||
| 15 | 15 |
'title' => {'css' => "#comic img", 'attr' => "title"}
|
| 16 | 16 |
} |
| 17 | 17 |
} |
| 18 |
- @checker = Agents::WebsiteAgent.new(:name => "xkcd", :options => @site) |
|
| 18 |
+ @checker = Agents::WebsiteAgent.new(:name => "xkcd", :options => @site, :keep_events_for => 2) |
|
| 19 | 19 |
@checker.user = users(:bob) |
| 20 | 20 |
@checker.save! |
| 21 | 21 |
end |
| 22 | 22 |
|
| 23 | 23 |
describe "#check" do |
| 24 |
- it "should check for changes" do |
|
| 24 |
+ |
|
| 25 |
+ it "should validate the integer fields" do |
|
| 26 |
+ @checker.options['expected_update_period_in_days'] = "nonsense" |
|
| 27 |
+ lambda { @checker.save! }.should raise_error;
|
|
| 28 |
+ @checker.options['expected_update_period_in_days'] = "2" |
|
| 29 |
+ @checker.options['uniqueness_look_back'] = "nonsense" |
|
| 30 |
+ lambda { @checker.save! }.should raise_error;
|
|
| 31 |
+ @checker.options['mode'] = "nonsense" |
|
| 32 |
+ lambda { @checker.save! }.should raise_error;
|
|
| 33 |
+ @checker.options = @site |
|
| 34 |
+ end |
|
| 35 |
+ |
|
| 36 |
+ it "should check for changes (and update Event.expires_at)" do |
|
| 25 | 37 |
lambda { @checker.check }.should change { Event.count }.by(1)
|
| 38 |
+ event = Event.last |
|
| 39 |
+ sleep 2 |
|
| 26 | 40 |
lambda { @checker.check }.should_not change { Event.count }
|
| 41 |
+ update_event = Event.last |
|
| 42 |
+ update_event.expires_at.should_not == event.expires_at |
|
| 27 | 43 |
end |
| 28 | 44 |
|
| 29 | 45 |
it "should always save events when in :all mode" do |
@@ -35,6 +51,30 @@ describe Agents::WebsiteAgent do |
||
| 35 | 51 |
}.should change { Event.count }.by(2)
|
| 36 | 52 |
end |
| 37 | 53 |
|
| 54 |
+ it "should take uniqueness_look_back into account during deduplication" do |
|
| 55 |
+ @site['mode'] = 'all' |
|
| 56 |
+ @checker.options = @site |
|
| 57 |
+ @checker.check |
|
| 58 |
+ @checker.check |
|
| 59 |
+ event = Event.last |
|
| 60 |
+ event.payload = "{}"
|
|
| 61 |
+ event.save |
|
| 62 |
+ |
|
| 63 |
+ lambda {
|
|
| 64 |
+ @site['mode'] = 'on_change' |
|
| 65 |
+ @site['uniqueness_look_back'] = 2 |
|
| 66 |
+ @checker.options = @site |
|
| 67 |
+ @checker.check |
|
| 68 |
+ }.should_not change { Event.count }
|
|
| 69 |
+ |
|
| 70 |
+ lambda {
|
|
| 71 |
+ @site['mode'] = 'on_change' |
|
| 72 |
+ @site['uniqueness_look_back'] = 1 |
|
| 73 |
+ @checker.options = @site |
|
| 74 |
+ @checker.check |
|
| 75 |
+ }.should change { Event.count }.by(1)
|
|
| 76 |
+ end |
|
| 77 |
+ |
|
| 38 | 78 |
it "should log an error if the number of results for a set of extraction patterns differs" do |
| 39 | 79 |
@site['extract']['url']['css'] = "div" |
| 40 | 80 |
@checker.options = @site |
@@ -79,7 +119,7 @@ describe Agents::WebsiteAgent do |
||
| 79 | 119 |
'expected_update_period_in_days' => 2, |
| 80 | 120 |
'type' => "html", |
| 81 | 121 |
'url' => "http://xkcd.com", |
| 82 |
- 'mode' => :on_change, |
|
| 122 |
+ 'mode' => "on_change", |
|
| 83 | 123 |
'extract' => {
|
| 84 | 124 |
'url' => {'css' => "#topLeft a", 'attr' => "href"},
|
| 85 | 125 |
'title' => {'css' => "#topLeft a", 'text' => "true"}
|
@@ -216,5 +256,4 @@ describe Agents::WebsiteAgent do |
||
| 216 | 256 |
end |
| 217 | 257 |
end |
| 218 | 258 |
end |
| 219 |
- |
|
| 220 |
-end |
|
| 259 |
+end |